In [1]:
import pandas as pd
import numpy
import json
from collections import defaultdict
import scipy.stats
import math
import pywikibot
from matplotlib.pylab import style
# Use the FiveThirtyEight style sheet for every figure in this notebook.
style.use('fivethirtyeight')
# %pylab injects numpy/matplotlib names into the namespace; the plotting cells
# below rely on it for `plt`.
%pylab inline
# NOTE(review): not referenced anywhere else in the visible notebook —
# presumably a missing-value sentinel from the upstream data; confirm or remove.
java_min_int = -2147483648
In [2]:
# Load the pre-computed biography records. The `with` block guarantees the
# file handle is closed as soon as the JSON has been parsed.
with open('helpers/world_cultures_shortcut.json', 'r') as shortcut_file:
    allrecs = pd.DataFrame.from_dict(json.load(shortcut_file))
In [6]:
# Transforming QIDs into English labels.
# `wikidata` is the data repository that english_label() queries.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# Cache of QID -> label filled by english_label(), so repeated QIDs are not fetched again.
retrieved = dict()
def english_label(qid):
    '''Return the English Wikidata label for ``qid``, memoised in ``retrieved``.

    Missing values (``None`` or a float NaN) yield ``None`` without touching
    Wikidata. When the item has no English label, or the page does not exist,
    the QID itself is cached and returned so the caller still gets a name.
    '''
    # Missing cells arrive either as None or as a float NaN; neither is a QID.
    if qid is None:
        return None
    if type(qid) is float:
        if math.isnan(qid):
            return None
    # First see if we've done it already.
    if qid in retrieved:
        return retrieved[qid]
    try:
        page = pywikibot.ItemPage(wikidata, qid)
        data = page.get()
        lab = data['labels']['en']
    except (KeyError, pywikibot.exceptions.NoPage):
        # No English label / no such item: fall back to the raw QID.
        lab = qid
    retrieved[qid] = lab
    return lab
# Transforming QIDs into English labels.
# NOTE(review): these two assignments repeat the setup at the top of this cell;
# they look like a copy-paste leftover and can probably be removed.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
In [7]:
# Resolve the citizenship and country QIDs of every record to English labels.
allrecs['citname'] = allrecs['citizenship'].map(english_label)
allrecs['countryname'] = allrecs['country'].map(english_label)
In [8]:
# Every label Wikidata uses for a citizenship or a country in the records.
wikidatanames = set(allrecs['citname']) | set(allrecs['countryname'])
In [9]:
# Spellings used by the foreign indexes -> the name used for the same economy
# in the Wikidata-derived table. Built once instead of on every call.
# The near-identical non-ASCII keys are kept on purpose: under Python 2 a byte
# string and a unicode string with the same text are different dictionary keys.
_ECONOMY_ALIASES = {
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Rep.':'South Korea',
    'Brunei Darussalam': 'Brunei',
    'United States':'United States of America',
    'Slovak Republic':'Slovakia',
    'China':"People's Republic of China",
    'People’s Republic of China':"People's Republic of China",
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Russian Federation': 'Russia',
    'Macedonia, FYR': 'Republic of Macedonia',
    'Lao PDR':'Laos',
    'Bahamas':'The Bahamas',
    u'C\xf4te d\u2019Ivoire':u"C\xf4te d'Ivoire",
    'Côte d’Ivoire':u"C\xf4te d'Ivoire",
    'Plu. St.. of Bolivia':'Bolivia',
    'Viet Nam':'Vietnam',
    'Myanmar':'Burma',
    'Former Yugoslav Republic of Macedonia':'Macedonia',
    'Lao People’s Democratic Republic':'Laos',
    'Bolivarian Republic of Venezuela':'Venezuela',
    'Republic of Moldova':'Moldova',
    'Central African Rep.':'Central African Republic',
    'Syrian Arab Republic':'Syria',
    'Republic of Tanzania':'Tanzania',
    'Palestine, State of':'Palestine',
    'Moldova (Republic of)':'Moldova',
    'Sao Tome and Principe': u'Sao Tom\xe9 and Pr\xedncipe',
    "Lao People's Democratic Republic":'Laos',
    'Venezuela (Bolivarian Republic of)':'Venezuela',
    'The former Yugoslav Republic of Macedonia':'Macedonia',
    'Iran (Islamic Republic of)':'Iran',
    'Congo (Democratic Republic of the)': u'Democratic Republic of the Congo',
    'Congo':u'Republic of the Congo',
    'Tanzania (United Republic of)':'Tanzania',
    'Hong Kong, China (SAR)':"People's Republic of China",
    'Korea (Republic of)':'South Korea',
    'Bolivia (Plurinational State of)':'Bolivia',
}


def normname(name):
    '''Map an economy name from a foreign index onto its canonical spelling.

    Asterisks (footnote markers) are removed first; names without a known
    alias are returned unchanged apart from that.
    '''
    name = name.replace('*','')
    return _ECONOMY_ALIASES.get(name, name)
In [10]:
# WEF Global Gender Gap 2014 rankings: first HTML table on the page.
# NOTE: fetched live from the web on every run.
wef_tables = pd.read_html('http://reports.weforum.org/global-gender-gap-report-2014/rankings/')
wef = wef_tables[0]
# Bring the economy names in line with the Wikidata labels.
wef['Economy'] = wef['Economy'].map(normname)
wefnames = set(wef['Economy'])
In [11]:
# Social Watch Gender Equity Index: third HTML table on the page.
# NOTE: fetched live from the web on every run.
geidirty = pd.io.html.read_html('http://www.socialwatch.org/node/14367')[2]
# Rows from position 3 on, columns 6-7. Copy so the assignments below act on
# an independent frame rather than a slice of `geidirty`.
gei = geidirty.iloc[3:, 6:8].copy()
gei.columns = ['Economy', 'Score']
gei = gei.dropna().copy()
# Normalise names like `wef` and `sigi` do, so joins against the Wikidata
# labels match the same economies (normname is idempotent).
gei['Economy'] = gei['Economy'].apply(normname)
# NOTE(review): Score is ranked as scraped; if the column holds strings the
# ranking is lexicographic — confirm the dtype.
gei["Rank"] = gei['Score'].rank(ascending=False).apply(lambda x: int(x))
In [12]:
def country_sigi_extract(text_line):
    '''Split one pasted table line into (economy name, SIGI value).

    The space-separated words before the first token that parses as a float
    form the name; that float is the value. Lines without any number yield
    the whole line as the name and 0.0 as the value.
    '''
    name_tokens = []
    for token in text_line.split(' '):
        try:
            value = float(token)
        except ValueError:
            # Empty tokens before the first real word are dropped; later ones
            # are kept so interior spacing is reproduced.
            if name_tokens or token:
                name_tokens.append(token)
            continue
        return ' '.join(name_tokens), value
    return ' '.join(name_tokens), float()
# Parse each line of the pasted SIGI table into {raw economy name: SIGI value}.
# NOTE: `sigipdftext` is assigned in a cell near the end of the notebook; that
# cell has to be run before this one.
ec_sigi = dict(country_sigi_extract(text_line) for text_line in sigipdftext.split('\n'))
sigi = pd.DataFrame.from_dict(ec_sigi, orient='index')
# The index holds the raw names; the Economy column holds the normalised ones.
sigi['Economy'] = [normname(raw_name) for raw_name in sigi.index]
# Score is the complement of the SIGI value (column 0), ranked in descending order.
sigi['Score'] = 1 - sigi[0]
sigi["Rank"] = sigi['Score'].rank(ascending=False).apply(int)
In [13]:
# UNDP Gender-related Development Index, read from the local CSV export.
gdidirty = pd.DataFrame.from_csv(
    'helpers/foreign_indexes/Table_5__Gender-related_development_index.csv')
# Skip the first row and keep the first three data columns.
nar = gdidirty.iloc[1:, 0:3]
nar.columns = ['Economy', 'Score', 'Rank']
In [14]:
# Drop rows whose score or rank is a placeholder ('..' / '—'). Copy so later
# cells can assign columns on `gdi` without writing through a filtered view
# (avoids SettingWithCopyWarning).
gdi = nar[(nar['Score'] != '..') & (nar['Rank'] != '—')].copy()
In [15]:
# Convert the text columns to numbers.
gdi['Score'] = gdi['Score'].apply(float)
gdi['Rank'] = gdi['Rank'].apply(int)
# Normalise names like `wef` and `sigi` do, so joins against the Wikidata
# labels match the same economies (normname is idempotent).
gdi['Economy'] = gdi['Economy'].apply(normname)
gdi.sort('Score')
Out[15]:
In [19]:
# GEI economy names that have no counterpart among the Wikidata labels.
geinames = set(gei['Economy'].apply(normname))
print(geinames)
# Compare the GEI names themselves; the original diffed `gdinames`, which is
# only defined by a later cell.
unknown = geinames.difference(wikidatanames)
for uk in unknown:
    print(uk)
In [18]:
# GDI economy names that have no counterpart among the Wikidata labels.
gdinames = set(normname(economy) for economy in gdi['Economy'])
unknown = gdinames - wikidatanames
for uk in unknown:
    print(uk)
In [20]:
# SIGI economy names that have no counterpart among the Wikidata labels.
siginames = set(sigi['Economy'].tolist())
unknown = siginames - wikidatanames
for uk in unknown:
    print(uk)
In [21]:
def calibrate_rank_corr(foreign_index, short=False, end_year=1990):
    '''Take a foreign index and find the window for which WIGI most correlates with it.

    For each candidate start year, the records in ``allrecs`` with
    ``start_year <= dob < end_year`` are grouped by economy, the share of
    non-male biographies per economy is ranked, and that ranking is compared
    with ``foreign_index``.

    Parameters
    ----------
    foreign_index : DataFrame
        Must provide ``Economy``, ``Score`` and ``Rank`` columns.
    short : bool
        When true, only the window starting in 1900 is evaluated.
    end_year : int
        Exclusive upper bound on ``dob``; defaults to the previously
        hard-coded 1990.

    Returns
    -------
    DataFrame
        One row per start year with the columns ``start_year``,
        ``bios_count``, ``spearman``, ``spearman_p``, ``mannwhitneyu``,
        ``mannwhitneyu_p``, ``ranksum`` and ``ranksum_p``.
    '''
    result_columns = ['start_year', 'bios_count', 'spearman', 'spearman_p',
                      'mannwhitneyu', 'mannwhitneyu_p', 'ranksum', 'ranksum_p']
    if not short:
        # list(...) keeps the concatenation valid on Python 3 as well.
        some_modern_history = list(range(1000, 1800, 100)) + list(range(1800, 1980, 10))
    else:
        some_modern_history = [1900]

    def combine_economy(row):
        '''Prefer the citizenship QID; fall back to the country QID.'''
        cit = row['citizenship']
        # NaN is truthy, so a plain `if cit` would keep a missing citizenship
        # and never fall back to the country.
        if pd.notnull(cit) and cit:
            return cit
        return row['country']

    def scale_col(col):
        '''Min-max scale a numeric column to [0, 1].'''
        low = col.min()
        return (col - low) / (col.max() - low)

    foreign_names = set(foreign_index['Economy'])
    records = []
    for start_year in some_modern_history:
        in_window = (allrecs['dob'] >= start_year) & (allrecs['dob'] < end_year)
        # Copies keep the column assignments below off views of `allrecs`.
        cdf = allrecs.loc[in_window, ['country', 'citizenship', 'gender']].copy()
        cdf['Economy_qid'] = cdf.apply(combine_economy, axis=1)
        edf = cdf[cdf['Economy_qid'].notnull()].copy()
        bios_count = len(edf)
        edf['Economy'] = edf['Economy_qid'].apply(english_label)

        # Share of non-male biographies per economy.
        country_perc = {}
        for country, group in edf.groupby(by='Economy'):
            total = group['gender'].count()
            # Q6581097 is the gender value treated as "male" here.
            nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
            country_perc[country] = {'Economy': country,  # for later on joining
                                     'Score': nonmale / float(total),
                                     'total': total}
        wdf = pd.DataFrame.from_dict(country_perc, orient='index')

        wdf_matching = wdf[wdf['Economy'].apply(lambda name: name in foreign_names)].copy()
        wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(int)
        compare_columns = ['Economy', 'Rank', 'Rank-Wikidata', 'Score', 'Score-Wikidata']
        rank_compare = foreign_index.join(wdf_matching, on='Economy', how='left',
                                          rsuffix='-Wikidata')[compare_columns]
        # The left join leaves NaN for economies absent from Wikidata; drop
        # them so they do not distort (or NaN out) the statistics below.
        rank_compare = rank_compare.dropna(
            subset=['Rank', 'Rank-Wikidata', 'Score', 'Score-Wikidata'])

        spearman_results = scipy.stats.spearmanr(rank_compare[['Rank', 'Rank-Wikidata']])
        score_norm = scale_col(rank_compare['Score'])
        score_wikidata_norm = scale_col(rank_compare['Score-Wikidata'])
        mannwhitneyu, mannwhitneyu_p = scipy.stats.mannwhitneyu(score_norm, score_wikidata_norm)
        ranksum, ranksum_p = scipy.stats.ranksums(score_norm, score_wikidata_norm)

        records.append(dict(start_year=start_year,
                            bios_count=bios_count,
                            spearman=spearman_results[0],
                            spearman_p=spearman_results[1],
                            mannwhitneyu=mannwhitneyu,
                            mannwhitneyu_p=mannwhitneyu_p,
                            ranksum=ranksum,
                            ranksum_p=ranksum_p))
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=result_columns)  # todo just return the max spearman
In [22]:
# Run the full start-year sweep once per foreign index (SIGI, WEF, GDI, GEI),
# then display each resulting table in its own cell.
sigi_corr_df = calibrate_rank_corr(sigi, short=False)
In [27]:
wef_corr_df = calibrate_rank_corr(wef, short=False)
In [ ]:
gdi_corr_df = calibrate_rank_corr(gdi, short=False)
In [ ]:
gei_corr_df = calibrate_rank_corr(gei, short=False)
In [ ]:
gei_corr_df
In [ ]:
sigi_corr_df
In [ ]:
gdi_corr_df
In [29]:
wef_corr_df
Out[29]:
In [ ]:
# One figure per index: Spearman rho and its p-value against the start year.
for df in (gdi_corr_df, sigi_corr_df, wef_corr_df):
    df.plot(y=['spearman', 'spearman_p'], x='start_year')
    plt.show()
In [87]:
# Cache the WEF sweep on disk so it does not have to be recomputed.
wef_corr_df.to_pickle('opensym/wefdf')
In [2]:
# Reload the cached sweep. The execution count restarts here, which suggests
# this cell was run in a fresh kernel. Only unpickle files you produced yourself.
wef_corr_df = pd.read_pickle('opensym/wefdf')
In [9]:
# Publication figure: Spearman rho and p-value of WIGI vs. GGGI per start year.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
wef_corr_df.plot(y=['spearman', 'spearman_p'], x='start_year', ax=ax)
ax.set_xlabel('Start Year')
ax.set_ylabel('Correlation coefficient')
# 'lower left' is location code 3.
ax.legend((r'Spearman $\rho$', 'Significance $p$'), loc='lower left')
fig.suptitle('WIGI-GGGI Rank Correlation by Start Year', size=24)
# Leave room for the suptitle above the axes.
fig.subplots_adjust(top=0.88)
fig.savefig('opensym/spearman_evolution_gggi.png')
In [ ]:
# `corr_df` only exists inside calibrate_rank_corr, so the bare name raised a
# NameError on a fresh kernel; plot the WEF sweep instead.
# NOTE(review): confirm the WEF sweep is the intended one.
wef_corr_df.plot(x='start_year',y=['mannwhitneyu', 'mannwhitneyu_p'], secondary_y='mannwhitneyu_p')
In [ ]:
# `corr_df` only exists inside calibrate_rank_corr, so the bare name raised a
# NameError on a fresh kernel; plot the WEF sweep instead.
# NOTE(review): confirm the WEF sweep is the intended one.
wef_corr_df.plot(x='start_year',y=['ranksum', 'ranksum_p'], secondary_y='ranksum_p')
In [ ]:
# Top economies by non-male share among those matched to the foreign index.
# NOTE(review): at notebook level `wdf_matching` is only assigned by the cell
# that follows, so this fails on a fresh top-to-bottom run.
wdf_matching.sort('Score',ascending=False).head()
In [ ]:
# Wikidata gender table for biographies with 1890 <= dob < 1990, lined up
# against the WEF ranking.
modrecs = allrecs[(allrecs['dob'] >= 1890) & (allrecs['dob'] < 1990)]
# Copy so the new column lands on an independent frame, not a view of allrecs.
cdf = modrecs[['country', 'citizenship', 'gender']].copy()


def combine_economy(row):
    '''Prefer the citizenship QID; fall back to the country QID.'''
    cit = row['citizenship']
    cunt = row['country']
    # NaN is truthy, so test for a missing citizenship explicitly.
    return cit if (pd.notnull(cit) and cit) else cunt


cdf['Economy_qid'] = cdf.apply(combine_economy, axis=1)
# notnull() drops both None and NaN economies.
edf = cdf[cdf['Economy_qid'].notnull()].copy()
bios_count = len(edf)
edf['Economy'] = edf['Economy_qid'].apply(english_label)

# Share of non-male biographies per economy.
country_perc = defaultdict(dict)
country_groups = edf.groupby(by='Economy')
for country, group in country_groups:
    # Q6581097 is the gender value treated as "male" here.
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country  # for later on joining
    country_perc[country]['Score'] = nm_perc  # for later on joining
    country_perc[country]['total'] = total
wdf = pd.DataFrame.from_dict(country_perc, orient='index')

# Keep only economies the WEF index also covers, then rank them.
wdf_matching = wdf[wdf['Economy'].apply(lambda x: x in wefnames)].copy()
wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(lambda x: int(x))
rank_compare = wef.join(wdf_matching, on='Economy', how='left', rsuffix='_wikidata')[
    ['Economy', 'Rank', 'Rank_wikidata', 'Score', 'Score_wikidata']]
rank_compare['diff'] = rank_compare['Rank'] - rank_compare['Rank_wikidata']
In [ ]:
# The original line (`to_html(formatters=)`) was a syntax error left over from
# looking up the `formatters` argument; show the documentation instead.
help(pd.DataFrame.to_html)
In [ ]:
# Inspect the column names before relabelling them.
print rank_compare.columns
In [ ]:
# Human-readable headers for the comparison table (same column order as above).
rank_compare.columns = ['Country', 'WEF Rank', 'Wikipedia Rank','WEF Score','Wikipedia Score','Rank Difference']
In [ ]:
# HTML for the ten best WEF-ranked economies, Wikipedia score shown to four decimals.
rank_compare.sort('WEF Rank').head(10).to_html(index=False,formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})
In [ ]:
# to_csv has no `formatters` argument (that belongs to to_html), so the
# Wikipedia score is formatted to four decimals on a copy before writing.
wigi_export = rank_compare.sort('Wikipedia Rank').copy()
wigi_export['Wikipedia Score'] = wigi_export['Wikipedia Score'].apply(
    lambda x: '{:0.4f}'.format(float(x)) if pd.notnull(x) else x)
wigi_export.to_csv('helpers/foreign_indexes/WIGI_comparison.csv', encoding='utf-8', index=False)
In [ ]:
# Keep economies with more than 30 biographies. Copy so the Rank column added
# in the next cell is written to an independent frame, not a view of `wdf`.
wdfc = wdf[wdf['total'] > 30].copy()
In [ ]:
# Rank the remaining economies by non-male share, highest first.
wdfc['Rank'] = wdfc['Score'].rank(ascending=False).apply(int)
Background: UNDP's Gender-related Development Index (GDI) and Gender Empowerment Measure (GEM) were introduced only in 1995. More recently, three new measures were developed: the Gender Equity Index (GEI), introduced by Social Watch in 2005; the Global Gender Gap Index (GGGI), developed by the World Economic Forum in 2006; and the Social Institutions and Gender Index (SIGI) of the OECD Development Centre, from 2007.
In [ ]:
# Side-by-side table: WIGI score/rank plus score/rank of each foreign index.
# Copy so setting the index does not touch `wdfc`.
fiveway = wdfc[['Economy', 'Score', 'Rank']].copy()
fiveway.index = fiveway['Economy']
for findex, ftext in zip([sigi, gdi, gei, wef], ['SIGI', 'GDI', 'GEI', 'GGGI']):
    # set_index returns a new frame, so the shared index frames keep their
    # own index instead of being overwritten inside this loop.
    foreign_cols = findex.set_index('Economy')[['Score', 'Rank']]
    fiveway = fiveway.join(foreign_cols, how='outer', on="Economy", rsuffix=" {}".format(ftext))
In [ ]:
fiveway.columns = fiveway.columns[:1] + ['Score WIGI','Rank WIGI'] + fiveway.columns[2:]
In [ ]:
fiveway.sort('Rank').to_csv('helpers/foreign_indexes/WIGI_comparison.csv',encoding = 'utf-8', index=False, formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})
In [ ]:
Quite uncorrelated. That means either that the data is not good, or that the World Economic Forum's methods have little to do with the percentage of women born in those countries as recorded semantically on a historic level. And $\rho$ is high.
In [ ]:
# Not clean data (bad PDF copy-paste), but on each line the leading country name and the first number (the SIGI value) appear to have copied over intact.
sigipdftext = '''Belgium 0.0016 0.0038 very low 0.0316 very low 0.0824 low 0.0000 very low 0.0000 very low
France 0.0034 0.1002 low 0.0000 very low 0.0828 low 0.0000 very low 0.0000 very low
Slovenia 0.0037 0.0031 very low 0.0891 very low 0.1023 low 0.0000 very low 0.0000 very low
Spain 0.0049 0.0856 low 0.0622 very low 0.1144 low 0.0000 very low 0.0000 very low
Serbia 0.0097 0.1094 low 0.1171 very low 0.1504 medium 0.0000 very low 0.0000 very low
Argentina 0.0107 0.0809 low 0.0148 very low 0.0691 very low 0.2048 low 0.0000 very low
Italy 0.0116 0.0025 very low 0.1029 very low 0.0966 low 0.0000 very low 0.1951 low
Cuba 0.0208 0.2420 medium 0.0871 very low 0.0000 very low 0.0000 very low 0.1951 low
Trinidad and Tobago 0.0236 0.2504 medium 0.1306 very low 0.0000 very low 0.0000 very low 0.1951 low
Czech Republic 0.0283 0.0013 very low 0.0956 very low 0.0855 low 0.0000 very low 0.3539 medium
Bosnia and Herzegovina 0.0333 0.2437 medium 0.0672 very low 0.1497 medium 0.2048 low 0.1951 low
Belarus 0.0336 0.0251 very low 0.3544 medium 0.0599 very low 0.0000 very low 0.1951 low
Mongolia 0.0345 0.0226 very low 0.2584 medium 0.1582 medium 0.2048 low 0.1951 low
Dominican Republic 0.0367 0.3691 medium 0.0958 very low 0.0118 very low 0.0000 very low 0.1951 low
Panama 0.0375 0.2344 low 0.0148 very low 0.0855 low 0.0000 very low 0.3539 medium
Bolivarian Republic of Venezuela 0.0389 0.2456 medium 0.0941 very low 0.0071 very low 0.0000 very low 0.3539 medium
Ecuador 0.0422 0.1374 low 0.3737 medium 0.1037 low 0.2048 low 0.0000 very low
Lithuania 0.0424 0.0013 very low 0.2795 medium 0.0931 low 0.0000 very low 0.3539 medium
Bulgaria 0.0449 0.1504 low 0.3926 medium 0.0988 low 0.0000 very low 0.1951 low
Brazil 0.0458 0.2316 low 0.1226 very low 0.0364 very low 0.1837 low 0.3539 medium
Cambodia 0.0477 0.0684 low 0.2601 medium 0.0000 very low 0.2028 low 0.3539 medium
El Salvador 0.0490 0.1066 low 0.2675 medium 0.1049 low 0.3885 medium 0.0000 very low
Costa Rica 0.0506 0.2513 medium 0.1544 low 0.0121 very low 0.4076 medium 0.0000 very low
Latvia 0.0511 0.0044 very low 0.3466 medium 0.1008 low 0.0000 very low 0.3539 medium
Plu. St.. of Bolivia 0.0579 0.3676 medium 0.3207 medium 0.0987 low 0.2048 low 0.0000 very low
Paraguay 0.0580 0.2880 medium 0.0440 very low 0.0291 very low 0.4076 medium 0.1951 low
South Africa 0.0599 0.0213 very low 0.2164 low 0.2196 medium 0.4076 medium 0.1951 low
Republic of Moldova 0.0664 0.3418 medium 0.2189 low 0.0000 very low 0.2048 low 0.3539 medium
Romania 0.0686 0.1134 low 0.1700 low 0.0994 low 0.0000 very low 0.5399 high
Azerbaijan 0.2403 0.1301 low 0.2057 low 0.8587 very high 0.1837 low 0.6093 high
Armenia 0.2428 0.1910 low 0.1853 low 0.9880 very high 0.2048 low 0.3539 medium
Ethiopia 0.2450 0.2820 medium 0.8662 very high 0.0878 low 0.5913 high 0.1951 low
Albania 0.2476 0.1822 low 0.2596 medium 0.8767 very high 0.4076 medium 0.4505 medium
Ukraine 0.0750 0.0414 very low 0.1517 low 0.2430 high 0.0000 very low 0.5399 high
Peru 0.0826 0.4053 medium 0.2096 low 0.0284 very low 0.4076 medium 0.1951 low
Colombia 0.0862 0.1748 low 0.1567 low 0.0663 very low 0.0000 very low 0.6093 high United
Republic of Tanzania 0.2504 0.7166 very high 0.5415 high 0.1746 medium 0.5913 high 0.2554 low
Lesotho 0.0876 0.4266 high 0.4112 medium 0.2116 medium 0.2048 low 0.0000 very low
Côte d’Ivoire 0.2537 0.4955 high 0.5895 high 0.1858 medium 0.5913 high 0.5399 high
Madagascar 0.1002 0.4889 high 0.3079 medium 0.0000 very low 0.2048 low 0.3539 medium
Turkey 0.1032 0.1585 low 0.1913 low 0.4036 high 0.0000 very low 0.5399 high
Timor-Leste 0.2550 0.3882 medium 0.5421 high 0.2271 medium 0.5913 high 0.6552 high
Iraq 0.2631 0.7035 very high 0.3347 medium 0.3834 high 0.5913 high 0.4601 medium
Morocco 0.1052 0.4610 high 0.3159 medium 0.1574 medium 0.3885 medium 0.1951 low
India 0.2650 0.6440 very high 0.3772 medium 0.5415 very high 0.5913 high 0.3539 medium
Thailand 0.1056 0.3770 medium 0.2935 medium 0.1533 medium 0.3885 medium 0.3539 medium
Benin 0.2780 0.2763 medium 0.4432 high 0.3677 high 0.5913 high 0.7953 very high
Honduras 0.1074 0.3891 medium 0.1044 very low 0.1443 medium 0.3885 medium 0.4505 medium
Cameroon 0.2803 0.5024 high 0.5333 high 0.2066 medium 0.7869 very high 0.4505 medium high
Burkina Faso 0.2819 0.5419 high 0.7257 very high 0.1910 medium 0.5913 high 0.4505 medium
Lebanon 0.2897 0.6143 very high 0.2488 medium 0.1639 medium 0.5913 high 0.7953 very high
Namibia 0.1173 0.1709 low 0.3522 medium 0.0668 very low 0.5913 high 0.2812 low
Kazakhstan 0.1196 0.0282 very low 0.2176 low 0.1126 low 0.4076 medium 0.6093 high
Myanmar 0.2935 0.4963 high 0.4891 high 0.0000 very low 0.5913 high 0.7953
Ghana 0.2988 0.3946 medium 0.5491 high 0.3136 high 0.8044 very high 0.5399 high
Pakistan 0.3013 0.6908 very high 0.4127 medium 0.6998 very high 0.4076 medium 0.4505 medium
People’s Republic of China 0.1310 0.2885 medium
Guatemala 0.1318 0.3953 medium
Rwanda 0.1339 0.2618 medium 0.1246 very low 0.5578 very high 0.4076 0.3213 medium 0.4082 medium 0.2566 high 0.2048 0.1392 medium 0.5913
Former Yugoslav Republic of Macedonia 0.1345 0.1803 low 0.3911
Jamaica 0.1350 0.0031 very low 0.2046 low
Mozambique 0.1375 0.4181 high 0.3793 medium
Zimbabwe 0.1392 0.5700 very high 0.3435 medium 0.2951
Tajikistan 0.1393 0.3182 medium 0.4138 medium 0.5075 medium 0.5666 medium 0.2812 low low 0.5399 high high 0.2554 low very high 0.4076 medium 0.0271 very low 0.0000 0.0000 very low 0.4076 high very high
Jordan 0.3119 0.5274 high 0.3150 medium 0.6790 very high 0.5913 high 0.6093 high
Guinea 0.3206 0.5413 high 0.9515 very high 0.2253 medium 0.3885 medium 0.4505 medium
Afghanistan 0.3224 0.7316 very high 0.5473 high 0.4644 very high 0.5913 high 0.4601 medium
Nepal 0.3229 0.1813 low 0.4083 medium 1.0000 very high 0.5913 high 0.2554 low
Central African Rep. 0.3285 0.5327 high 0.6135 high 0.0071 very low 0.5913 high 0.7953 very high
Bangladesh 0.3900 0.9730 very high 0.3323 medium 0.5831 very high 0.5913 high 0.4505 medium 0.2028 low 0.3539 medium
Nigeria 0.3911 0.6723 very high 0.4766 high 0.2494 high 0.7626 very high 0.7953 very high
Mauritania 0.3954 0.7556 very high 0.9939 very high 0.1746 medium 0.5913 high 0.1951 low
Gabon 0.4022 0.6457 very high 0.5308 high 0.1746 medium 0.7869 very high 0.8140 very high
Syrian Arab Republic 0.4162 0.6914 very high 0.2598 medium 0.4312 high 0.5913 high 1.0000 very high
Lao People’s Democratic Republic 0.1445 0.2606 medium 0.5321 high 0.0506 very low 0.4076 medium 0.4505 medium
Haiti 0.1466 0.5613 very high 0.5010 high 0.0000 very low 0.2048 low 0.3539 medium
Uzbekistan 0.1475 0.2477 medium 0.2966 medium 0.1884 medium 0.5913 high 0.4505 medium
Indonesia 0.1532 0.5612 very high 0.2511 medium 0.3891 high 0.1837 low 0.4505 medium
Nicaragua 0.1595 0.6303 very high 0.1868 low 0.1082 low 0.3885 medium 0.4505 medium
Kyrgyzstan 0.1598 0.1879 low 0.3771 medium 0.2624 high 0.5913 high 0.4505 medium
Burundi 0.1662 0.5602 very high 0.5055 high 0.1746 medium 0.4076 medium 0.2554 low
Angola 0.1719 0.4599 high 0.5041 high 0.0791 low 0.5913 high 0.1951 low
Philippines 0.1765 0.4929 high 0.2597 medium 0.1392 medium 0.5913 high 0.4505 medium
Togo 0.1860 0.3696 medium 0.5488 high 0.1326 medium 0.5913 high 0.3539 medium
Viet Nam 0.1865 0.3374 medium 0.1857 low 0.4967 very high 0.4076 medium 0.6093 high
Sri Lanka 0.1894 0.4203 high 0.2681 medium 0.1483 medium 0.6207 high 0.5399 high
Democratic Republic of the Congo 0.4276 0.5169 high 0.5338 high 0.0691 very low 0.9582 very high 0.8140 very high
Egypt 0.4280 0.6665 very high 0.7373 very high 0.3741 high 0.5913 high 0.8140 very high
Niger 0.4415 1.0000 very high 0.4059 medium 0.1746 medium 0.5913 high 0.8140 very high
Zambia 0.4489 0.5149 high 0.5624 high 0.1746 medium 1.0000 very high 0.7953 very high
Somalia 0.4594 0.5958 very high 0.9905 very high 0.0891 low 0.7626 very high 0.6093 high
Chad 0.4665 0.9705 very high 0.8185 very high 0.0014 very low 0.5913 high 0.6093 high
Mali 0.5164 0.8309 very high 1.0000 very high 0.3048 high 0.4076 medium 0.7953 very high very high
Gambia 0.5240 0.5131 high 0.8509 very high 0.0000 very low 1.0000 very high 0.7953
Sudan 0.5550 0.8382 very high 0.9781 very high 0.1426 medium 0.8163 very high 0.6552 high very high 0.3414 high 0.5913 high 1.0000 very high'''
In [ ]:
# Wikidata gender table for biographies with 1900 <= dob < 1990.
modrecs = allrecs[(allrecs['dob'] >= 1900) & (allrecs['dob'] < 1990)]
# Copy so the new column lands on an independent frame, not a view of allrecs.
cdf = modrecs[['country', 'citizenship', 'gender']].copy()


def combine_economy(row):
    '''Prefer the citizenship QID; fall back to the country QID.'''
    cit = row['citizenship']
    cunt = row['country']
    # NaN is truthy, so test for a missing citizenship explicitly.
    return cit if (pd.notnull(cit) and cit) else cunt


cdf['Economy_qid'] = cdf.apply(combine_economy, axis=1)
# notnull() drops both None and NaN economies.
edf = cdf[cdf['Economy_qid'].notnull()].copy()
bios_count = len(edf)
edf['Economy'] = edf['Economy_qid'].apply(english_label)

# Share of non-male biographies per economy.
country_perc = defaultdict(dict)
country_groups = edf.groupby(by='Economy')
for country, group in country_groups:
    # Q6581097 is the gender value treated as "male" here.
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country  # for later on joining
    country_perc[country]['Score'] = nm_perc  # for later on joining
    country_perc[country]['total'] = total
wdf = pd.DataFrame.from_dict(country_perc, orient='index')
In [ ]:
# Last 100 rows among economies with more than 100 biographies.
wdf[wdf['total']>100].tail(100)
In [ ]:
# Spot-check a single economy by its index label.
wdf.ix["People's Republic of China"]
In [ ]:
# Magnus' special format: a JSON list of [economy, score] pairs, limited to
# economies with a non-zero score and more than 100 biographies.
nonzero = wdf[(wdf['Score'] != 0.0) & (wdf['total'] > 100)]
# list(...) so the pairs are serialisable on Python 3 too, where zip is lazy.
magnusformt = list(zip(nonzero['Economy'], nonzero['Score']))
# The `with` block closes (and flushes) the file before the next cell reads it.
with open('Magnus Gender analysis/wigi_gender.json', 'w') as magnus_file:
    json.dump(magnusformt, magnus_file)
In [ ]:
# Shell escape: page through the exported JSON to eyeball the result.
!less Magnus\ Gender\ analysis/wigi_gender.json
In [ ]: